years <- seq(2014, 2019)
#years <- seq(2014, 2020)
forprofit_counts <- profit_states_1419 %>%
dplyr::select(contains("count_Forprofit_")) %>%
summarize(across(everything(), sum, na.rm = TRUE))
Warning: There was 1 warning in `summarize()`.
ℹ In argument: `across(everything(), sum, na.rm = TRUE)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
nonprofit_counts <- profit_states_1419 %>%
dplyr::select(contains("count_Nonprofit_")) %>%
summarize(across(everything(), sum, na.rm = TRUE))
govt_counts <- profit_states_1419 %>%
dplyr::select(contains("count_Government_")) %>%
summarize(across(everything(), sum, na.rm = TRUE))
plot(years, forprofit_counts, type = 'l', main= "Treatment Facility Counts over time", ylim = c(1000, 8000), col = 'blue', xlab = 'Year', ylab = '# Treatment Centers')
lines(years, nonprofit_counts, type = 'l', main= "Non Profit Counts", col = 'red')
lines(years, govt_counts, type = 'l', main= "Government Counts", col = 'green')
legend("bottomleft", c("for profit", "non profit", "government"), col = c("blue", "red", "green"), pch ='-', lwd = 2)

using map tutorial from: https://jtr13.github.io/cc19/different-ways-of-plotting-u-s-map-in-r.html#using-ggplot2-package
state abbreviations from: https://www.ssa.gov/international/coc-docs/states.html
library(ggplot2)
library(maps)
Attaching package: ‘maps’
The following object is masked from ‘package:purrr’:
map
library(mapdata)
library(plotly)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(sf)
Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
usa <- map_data('usa')
state <- map_data("state")
#merge state geo file to the for-profit information
state <- state %>%
left_join(state_abbrevs_df, by = c("region" = "state_names")) %>%
left_join(profit_states_1419, by = c('state_abbrevs' = 'STATE'))
#plot the number of for-profit treatment centers from 2015-2019
for (i in seq(2015, 2019))
{
plot_forprofits <- ggplot(data=state, aes(x=long, y=lat, fill= !!sym(paste0("count_Forprofit_", i, sep = "")), group=group)) +
scale_fill_gradient(low = "grey", high = "green", limits = c(0,800)) +
geom_polygon(color = "white") +
ggtitle(paste('# For-Profit Treatment Centers in', i)) +
coord_fixed(1.3)
print(plot_forprofits)
}





ggplot(data=state, aes(x=long, y=lat, fill= change_Forprofit_1519, group=group)) +
scale_fill_gradient(low = "grey", high = "green", limits = c(0,400)) +
geom_polygon(color = "white") +
ggtitle('Change in # For-Profit Treatment Centers from 2015 - 2019') +
coord_fixed(1.3)

for (i in seq(2015, 2019))
{
plot_nonprofits <- ggplot(data=state, aes(x=long, y=lat, fill= !!sym(paste0("count_Nonprofit_", i, sep = "")), group=group)) +
scale_fill_gradient(low = "grey", high = "blue", limits = c(0,900)) +
geom_polygon(color = "white") +
ggtitle(paste('# Non-Profit Treatment Centers in', i)) +
coord_fixed(1.3)
print(plot_nonprofits)
}





ggplot(data=state, aes(x=long, y=lat, fill= change_Nonprofit_1519, group=group)) +
scale_fill_gradient(low = "grey", high = "blue", limits = c(0,100)) +
geom_polygon(color = "white") +
ggtitle('Change in # Non-Profit Treatment Centers from 2015 - 2019') +
coord_fixed(1.3)

for (i in seq(2015, 2019))
{
plot_govt <- ggplot(data=state, aes(x=long, y=lat, fill= !!sym(paste0("count_Government_", i, sep = "")), group=group)) +
scale_fill_gradient(low = "grey", high = "red", limits = c(0,250)) +
geom_polygon(color = "white") +
ggtitle(paste('# Government Treatment Centers in', i)) +
coord_fixed(1.3)
print(plot_govt)
}





ggplot(data=state, aes(x=long, y=lat, fill= change_Government_1519, group=group)) +
scale_fill_gradient(low = "grey", high = "red", limits = c(0,50)) +
geom_polygon(color = "white") +
ggtitle('Change in # Government Treatment Centers from 2015 - 2019') +
coord_fixed(1.3)

# state_name <- "California"
#
# par(mar=c(5, 4, 4, 12), xpd=TRUE)
#
# plot(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "UDPYILAL", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "UDPYILAL", ]$est_total, type = 'o', ylim= c(0,10000), main = paste(state, "Substance Use Measures Over Time") ,xlab = "year", ylab = "Thousands of People in 2015-18")
#
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "ILLEMMON", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "ILLEMMON", ]$est_total, col = 'blue', type = 'o')
#
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "BNGDRK", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "BNGDRK", ]$est_total, col = 'red', type = 'o')
#
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "TXNPILAL", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "TXNPILAL", ]$est_total, col = 'green', type = 'o')
#
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "PNRNMYR", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "PNRNMYR", ]$est_total, col = 'purple', type = 'o')
#
# legend('topright',legend = c("Substance Use Disorder", "Drug use in past month", "Binge alcohol use", "Needing, not receiving tx", "pain reliever misuse"), col = c("black", "blue", "red", "green", "purple"), lty = 1, inset=c(-0.5, 0))
plot(state_final_df[state_final_df$profit_type == "Forprofit", ]$sud, state_final_df[state_final_df$profit_type == "Forprofit",]$tx_percapita, xlab = "Substance Use Disorder Counts (in thousands)", ylab = "For profit treatment centers per capita")
abline(lm(tx_percapita ~ sud, data = state_final_df[state_final_df$profit_type == "Forprofit", ]))

plot(state_final_df[state_final_df$profit_type == "Nonprofit", ]$sud, state_final_df[state_final_df$profit_type == "Nonprofit",]$tx_percapita, col = "blue", xlab = "Substance Use Disorder counts (in thousands) ", ylab = "Nonprofit treatment centers per capita")
abline(lm(tx_percapita ~ sud, data = state_final_df[state_final_df$profit_type == "Nonprofit", ]))

plot(state_final_df[state_final_df$profit_type == "Government", ]$sud, state_final_df[state_final_df$profit_type == "Government",]$tx_percapita, col = "green", xlab = "Substance Use Disorder counts (in thousands)", ylab = "Government treatment centers per capita")
abline(lm(tx_percapita ~ sud, data = state_final_df[state_final_df$profit_type == "Government", ]))

NA
NA
Look at the distribution of each input variable of interest at each
point in time
create_hist <- function(df, year_param, profit_param, col_param)
{
subset_data <- df %>%
filter(year == year_param, profit_type == profit_param)
hist(subset_data[[col_param]], main = paste("Histogram of ",year_param, profit_param, col_param), xlab = col_param)
}
#substance use measures are roughly normal
create_hist(state_final_df, 2016, "Forprofit", "binge_drinking")

create_hist(state_final_df, 2017, "Forprofit", "binge_drinking")

create_hist(state_final_df, 2018, "Forprofit", "binge_drinking")

create_hist(state_final_df, 2019, "Forprofit", "binge_drinking")

create_hist(state_final_df, 2016, "Forprofit", "illicit_druguse")

create_hist(state_final_df, 2017, "Forprofit", "illicit_druguse")

create_hist(state_final_df, 2018, "Forprofit", "illicit_druguse")

create_hist(state_final_df, 2019, "Forprofit", "illicit_druguse")

create_hist(state_final_df, 2016, "Forprofit", "needing_tx")

create_hist(state_final_df, 2017, "Forprofit", "needing_tx")

create_hist(state_final_df, 2018, "Forprofit", "needing_tx")

create_hist(state_final_df, 2019, "Forprofit", "needing_tx")

create_hist(state_final_df, 2016, "Forprofit", "sud")

create_hist(state_final_df, 2017, "Forprofit", "sud")

create_hist(state_final_df, 2018, "Forprofit", "sud")

create_hist(state_final_df, 2019, "Forprofit", "sud")

#for profit treatment centers are heavily skewed for all years
create_hist(state_final_df, 2016, "Forprofit", "count_tx")

create_hist(state_final_df, 2017, "Forprofit", "count_tx")

create_hist(state_final_df, 2018, "Forprofit", "count_tx")

create_hist(state_final_df, 2019, "Forprofit", "count_tx")

#non profit treatment centers are heavily skewed for all years
create_hist(state_final_df, 2016, "Nonprofit", "count_tx")

create_hist(state_final_df, 2017, "Nonprofit", "count_tx")

create_hist(state_final_df, 2018, "Nonprofit", "count_tx")

create_hist(state_final_df, 2019, "Nonprofit", "count_tx")

#government treatment centers are heavily skewed for all years
create_hist(state_final_df, 2016, "Government", "count_tx")

create_hist(state_final_df, 2017, "Government", "count_tx")

create_hist(state_final_df, 2018, "Government", "count_tx")

create_hist(state_final_df, 2019, "Government", "count_tx")

Investigating treatment centers per capita over time and by profit
status
forprofit_txpercapita <- state_final_df %>%
filter(profit_type == "Forprofit") %>%
group_by(year) %>%
summarize(avg_tx_percapita = mean(tx_percapita, na.rm = TRUE),
min_tx_percapita = min(tx_percapita, na.rm = TRUE),
max_tx_percapita = max(tx_percapita, na.rm = TRUE))
nonprofit_txpercapita <- state_final_df %>%
filter(profit_type == "Nonprofit") %>%
group_by(year) %>%
summarize(avg_tx_percapita = mean(tx_percapita, na.rm = TRUE),
min_tx_percapita = min(tx_percapita, na.rm = TRUE),
max_tx_percapita = max(tx_percapita, na.rm = TRUE))
govt_txpercapita <- state_final_df %>%
filter(profit_type == "Government") %>%
group_by(year) %>%
summarize(avg_tx_percapita = mean(tx_percapita, na.rm = TRUE),
min_tx_percapita = min(tx_percapita, na.rm = TRUE),
max_tx_percapita = max(tx_percapita, na.rm = TRUE))
plot(forprofit_txpercapita$year, forprofit_txpercapita$avg_tx_percapita, type = 'o', col = 'blue', ylim = c(0.000001, 0.00005))
#lines(forprofit_txpercapita$year, forprofit_txpercapita$min_tx_percapita, type = 'l', col = 'blue', lty = 2)
#lines(forprofit_txpercapita$year, forprofit_txpercapita$max_tx_percapita, type = 'l', col = 'blue', lty = 2)
lines(nonprofit_txpercapita$year, nonprofit_txpercapita$avg_tx_percapita, type = 'o', col = 'red' )
#lines(nonprofit_txpercapita$year, nonprofit_txpercapita$min_tx_percapita, type = 'l', col = 'red', lty = 2 )
#lines(nonprofit_txpercapita$year, nonprofit_txpercapita$max_tx_percapita, type = 'l', col = 'red', lty = 2 )
lines(govt_txpercapita$year, govt_txpercapita$avg_tx_percapita, type = 'o', col = 'green')

#lines(govt_txpercapita$year, govt_txpercapita$min_tx_percapita, type = 'l', col = 'green', lty = 2)
#lines(govt_txpercapita$year, govt_txpercapita$max_tx_percapita, type = 'l', col = 'green', lty = 2)
state_final_df[is.na(state_final_df$tx_percapita),]
#Connecticut, South Carolina, Ohio, Mississippi, New York have the fewest for-profit treatment centers per capita
#Maine, Idaho, Utah, North Dakota, Kentucky have the most for-profit treatment centers per capita
state_final_df %>%
group_by(State_Abbrev, profit_type) %>%
summarize(avg_txpercapita = mean(tx_percapita, na.rm = TRUE)) %>%
filter(profit_type == "Forprofit") %>%
arrange(avg_txpercapita) %>%
ungroup() %>%
left_join(state, by = c("State_Abbrev" = "state_abbrevs")) %>%
ggplot(aes(x=long, y=lat, fill= avg_txpercapita, group=group)) +
scale_fill_gradient(low = "grey", high = "green", limits = c(0,0.0001)) +
geom_polygon(color = "white") +
ggtitle('Average Nonprofit Treatment Centers per capita 2015-2019') +
coord_fixed(1.3)
`summarise()` has grouped output by 'State_Abbrev'. You can override using the `.groups` argument.

#Virginia, Texas, Idaho, South Carolina, Georgia have the fewest non-profit treatment centers per capita
#Hawaii, Alaska, Maine, Vermont, Wyoming have the most non-profit treatment centers per capita
state_final_df %>%
group_by(State_Abbrev, profit_type) %>%
summarize(avg_txpercapita = mean(tx_percapita, na.rm = TRUE)) %>%
filter(profit_type == "Nonprofit") %>%
arrange(avg_txpercapita) %>%
ungroup() %>%
left_join(state, by = c("State_Abbrev" = "state_abbrevs")) %>%
ggplot(aes(x=long, y=lat, fill= avg_txpercapita, group=group)) +
scale_fill_gradient(low = "grey", high = "blue", limits = c(0,0.0001)) +
geom_polygon(color = "white") +
ggtitle('Average Nonprofit Treatment Centers per capita 2015-2019') +
coord_fixed(1.3)
`summarise()` has grouped output by 'State_Abbrev'. You can override using the `.groups` argument.

#New Hampshire, Tennessee, Pennsylvania, Delaware, Kentucky have the fewest government treatment centers per capita
#Alaska, North Dakota, South Dakota, Wyoming, New Mexico have the most government treatment centers per capita
state_final_df %>%
group_by(State_Abbrev, profit_type) %>%
summarize(avg_txpercapita = mean(tx_percapita, na.rm = TRUE)) %>%
filter(profit_type == "Government") %>%
arrange(avg_txpercapita) %>%
ungroup() %>%
left_join(state, by = c("State_Abbrev" = "state_abbrevs")) %>%
ggplot(aes(x=long, y=lat, fill= avg_txpercapita, group=group)) +
scale_fill_gradient(low = "grey", high = "red", limits = c(0,0.00005)) +
geom_polygon(color = "white") +
ggtitle('Average government Treatment Centers per capita 2015-2019') +
coord_fixed(1.3)
`summarise()` has grouped output by 'State_Abbrev'. You can override using the `.groups` argument.

state_final_df %>%
group_by(State_Name, profit_type) %>%
pivot_wider(id_cols = c(State_Name, profit_type), names_from = year, values_from = count_tx, names_prefix = "yr") %>%
mutate(change_2016_17 = yr2017 - yr2016,
change_2017_18 = yr2018 - yr2017,
change_2018_19 = yr2019 - yr2018,
total_change = yr2019 - yr2017)
#mutate(increase = ifelse())
state_yearly_averages<- state_final_df %>%
distinct(State_Abbrev, profit_type, year, .keep_all = TRUE) %>%
group_by(year) %>%
summarize(avg_druguse = mean(illicit_druguse),
avg_drinking = mean(binge_drinking),
avg_od = mean(RATE)) %>%
arrange(year)
plot(state_yearly_averages$year, state_yearly_averages$avg_druguse, type = 'l')

plot(state_yearly_averages$year, state_yearly_averages$avg_drinking, type = 'l')

plot(state_yearly_averages$avg_od, state_yearly_averages$avg_od, type = 'l')

---
title: "SAMHSA NSSATS Exploratory Data Analysis"
output: html_notebook
---

```{r}

years <- seq(2014, 2019) 
#years <- seq(2014, 2020)
forprofit_counts <- profit_states_1419 %>% 
  dplyr::select(contains("count_Forprofit_")) %>% 
  summarize(across(everything(), sum, na.rm = TRUE))

nonprofit_counts <- profit_states_1419 %>% 
  dplyr::select(contains("count_Nonprofit_")) %>% 
  summarize(across(everything(), sum, na.rm = TRUE))

govt_counts <- profit_states_1419 %>% 
  dplyr::select(contains("count_Government_")) %>% 
  summarize(across(everything(), sum, na.rm = TRUE))


plot(years, forprofit_counts, type = 'l', main= "Treatment Facility Counts over time", ylim = c(1000, 8000), col = 'blue', xlab = 'Year', ylab = '# Treatment Centers')
lines(years, nonprofit_counts, type = 'l', main= "Non Profit Counts", col = 'red')
lines(years, govt_counts, type = 'l', main= "Government Counts", col = 'green')

legend("bottomleft", c("for profit", "non profit", "government"), col = c("blue", "red", "green"), pch ='-', lwd = 2)

```


using map tutorial from: https://jtr13.github.io/cc19/different-ways-of-plotting-u-s-map-in-r.html#using-ggplot2-package
state abbreviations from: https://www.ssa.gov/international/coc-docs/states.html

```{r}
library(ggplot2)
library(maps)
library(mapdata)
library(plotly)
library(sf)

usa <- map_data('usa')
state <- map_data("state")

#merge state geo file to the for-profit information
state <- state %>% 
              left_join(state_abbrevs_df, by = c("region" = "state_names")) %>% 
              left_join(profit_states_1419, by = c('state_abbrevs' = 'STATE')) 
                                      
```




```{r}
#plot the number of for-profit treatment centers from 2015-2019
for (i in seq(2015, 2019))
{
  plot_forprofits <- ggplot(data=state, aes(x=long, y=lat, fill= !!sym(paste0("count_Forprofit_", i, sep = "")), group=group)) + 
                      scale_fill_gradient(low = "grey", high = "green", limits = c(0,800)) +
                      geom_polygon(color = "white") + 
                      ggtitle(paste('# For-Profit Treatment Centers in', i)) + 
                      coord_fixed(1.3)
  print(plot_forprofits)
}


ggplot(data=state, aes(x=long, y=lat, fill= change_Forprofit_1519, group=group)) + 
                      scale_fill_gradient(low = "grey", high = "green", limits = c(0,400)) +
                      geom_polygon(color = "white") + 
                      ggtitle('Change in # For-Profit Treatment Centers from 2015 - 2019') + 
                      coord_fixed(1.3)

```


```{r}
for (i in seq(2015, 2019))
{
  plot_nonprofits <- ggplot(data=state, aes(x=long, y=lat, fill= !!sym(paste0("count_Nonprofit_", i, sep = "")), group=group)) + 
                      scale_fill_gradient(low = "grey", high = "blue", limits = c(0,900)) +
                      geom_polygon(color = "white") + 
                      ggtitle(paste('# Non-Profit Treatment Centers in', i)) + 
                      coord_fixed(1.3)
  print(plot_nonprofits)
}


ggplot(data=state, aes(x=long, y=lat, fill= change_Nonprofit_1519, group=group)) + 
                      scale_fill_gradient(low = "grey", high = "blue", limits = c(0,100)) +
                      geom_polygon(color = "white") + 
                      ggtitle('Change in # Non-Profit Treatment Centers from 2015 - 2019') + 
                      coord_fixed(1.3)
```

```{r}
for (i in seq(2015, 2019))
{
  plot_govt <- ggplot(data=state, aes(x=long, y=lat, fill= !!sym(paste0("count_Government_", i, sep = "")), group=group)) + 
                      scale_fill_gradient(low = "grey", high = "red", limits = c(0,250)) +
                      geom_polygon(color = "white") + 
                      ggtitle(paste('# Government Treatment Centers in', i)) + 
                      coord_fixed(1.3)
  print(plot_govt)
}



ggplot(data=state, aes(x=long, y=lat, fill= change_Government_1519, group=group)) + 
                      scale_fill_gradient(low = "grey", high = "red", limits = c(0,50)) +
                      geom_polygon(color = "white") + 
                      ggtitle('Change in # Government Treatment Centers from 2015 - 2019') + 
                      coord_fixed(1.3)
```



```{r}

# state_name <- "California"
# 
# par(mar=c(5, 4, 4, 12), xpd=TRUE)
# 
# plot(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "UDPYILAL", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "UDPYILAL", ]$est_total, type = 'o', ylim= c(0,10000), main = paste(state, "Substance Use Measures Over Time") ,xlab = "year", ylab = "Thousands of People in 2015-18")
# 
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "ILLEMMON", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "ILLEMMON", ]$est_total, col = 'blue', type = 'o')
# 
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "BNGDRK", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "BNGDRK", ]$est_total, col = 'red', type = 'o')
# 
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "TXNPILAL", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "TXNPILAL", ]$est_total, col = 'green', type = 'o')
# 
# lines(yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "PNRNMYR", ]$start_year, yearly_nsduh_data[yearly_nsduh_data$stname == state & yearly_nsduh_data$outcome == "PNRNMYR", ]$est_total, col = 'purple', type = 'o')
# 
# legend('topright',legend = c("Substance Use Disorder", "Drug use in past month", "Binge alcohol use", "Needing, not receiving tx", "pain reliever misuse"), col = c("black", "blue", "red", "green", "purple"), lty = 1,  inset=c(-0.5, 0))

```


```{r}

plot(state_final_df[state_final_df$profit_type == "Forprofit", ]$sud, state_final_df[state_final_df$profit_type == "Forprofit",]$tx_percapita, xlab = "Substance Use Disorder Counts (in thousands)", ylab = "For profit treatment centers per capita")
abline(lm(tx_percapita ~ sud, data = state_final_df[state_final_df$profit_type == "Forprofit", ]))


plot(state_final_df[state_final_df$profit_type == "Nonprofit", ]$sud, state_final_df[state_final_df$profit_type == "Nonprofit",]$tx_percapita, col = "blue", xlab = "Substance Use Disorder counts (in thousands) ", ylab = "Nonprofit treatment centers per capita")
abline(lm(tx_percapita ~ sud, data = state_final_df[state_final_df$profit_type == "Nonprofit", ]))

plot(state_final_df[state_final_df$profit_type == "Government", ]$sud, state_final_df[state_final_df$profit_type == "Government",]$tx_percapita, col = "green", xlab = "Substance Use Disorder counts (in thousands)", ylab = "Government treatment centers per capita")
abline(lm(tx_percapita ~ sud, data = state_final_df[state_final_df$profit_type == "Government", ]))


```



Look at the distribution of each input variable of interest at each point in time
```{r}

create_hist <- function(df, year_param, profit_param, col_param)
{
  subset_data <- df %>%
  filter(year == year_param, profit_type == profit_param)
  hist(subset_data[[col_param]], main = paste("Histogram of ",year_param, profit_param, col_param), xlab = col_param)
}

```

```{r}

#substance use measures are roughly normal
create_hist(state_final_df, 2016, "Forprofit", "binge_drinking")
create_hist(state_final_df, 2017, "Forprofit", "binge_drinking")
create_hist(state_final_df, 2018, "Forprofit", "binge_drinking")
create_hist(state_final_df, 2019, "Forprofit", "binge_drinking")

create_hist(state_final_df, 2016, "Forprofit", "illicit_druguse")
create_hist(state_final_df, 2017, "Forprofit", "illicit_druguse")
create_hist(state_final_df, 2018, "Forprofit", "illicit_druguse")
create_hist(state_final_df, 2019, "Forprofit", "illicit_druguse")

create_hist(state_final_df, 2016, "Forprofit", "needing_tx")
create_hist(state_final_df, 2017, "Forprofit", "needing_tx")
create_hist(state_final_df, 2018, "Forprofit", "needing_tx")
create_hist(state_final_df, 2019, "Forprofit", "needing_tx")

create_hist(state_final_df, 2016, "Forprofit", "sud")
create_hist(state_final_df, 2017, "Forprofit", "sud")
create_hist(state_final_df, 2018, "Forprofit", "sud")
create_hist(state_final_df, 2019, "Forprofit", "sud")
```
```{r}
#for profit treatment centers are heavily skewed for all years
create_hist(state_final_df, 2016, "Forprofit", "count_tx")
create_hist(state_final_df, 2017, "Forprofit", "count_tx")
create_hist(state_final_df, 2018, "Forprofit", "count_tx")
create_hist(state_final_df, 2019, "Forprofit", "count_tx")
```

```{r}
#non profit treatment centers are heavily skewed for all years
create_hist(state_final_df, 2016, "Nonprofit", "count_tx")
create_hist(state_final_df, 2017, "Nonprofit", "count_tx")
create_hist(state_final_df, 2018, "Nonprofit", "count_tx")
create_hist(state_final_df, 2019, "Nonprofit", "count_tx")
```
```{r}
#government treatment centers are heavily skewed for all years
create_hist(state_final_df, 2016, "Government", "count_tx")
create_hist(state_final_df, 2017, "Government", "count_tx")
create_hist(state_final_df, 2018, "Government", "count_tx")
create_hist(state_final_df, 2019, "Government", "count_tx")
```



Investigating treatment centers per capita over time and by profit status
```{r}
forprofit_txpercapita <- state_final_df %>% 
  filter(profit_type == "Forprofit") %>% 
  group_by(year) %>%
  summarize(avg_tx_percapita = mean(tx_percapita, na.rm = TRUE),
            min_tx_percapita = min(tx_percapita, na.rm = TRUE),
            max_tx_percapita = max(tx_percapita, na.rm = TRUE))
  
nonprofit_txpercapita <- state_final_df %>% 
  filter(profit_type == "Nonprofit") %>% 
  group_by(year) %>%
  summarize(avg_tx_percapita = mean(tx_percapita, na.rm = TRUE),
            min_tx_percapita = min(tx_percapita, na.rm = TRUE),
            max_tx_percapita = max(tx_percapita, na.rm = TRUE))

govt_txpercapita <- state_final_df %>% 
  filter(profit_type == "Government") %>% 
  group_by(year) %>%
  summarize(avg_tx_percapita = mean(tx_percapita, na.rm = TRUE),
            min_tx_percapita = min(tx_percapita, na.rm = TRUE),
            max_tx_percapita = max(tx_percapita, na.rm = TRUE))



plot(forprofit_txpercapita$year, forprofit_txpercapita$avg_tx_percapita, type = 'o', col = 'blue', ylim = c(0.000001, 0.00005))
#lines(forprofit_txpercapita$year, forprofit_txpercapita$min_tx_percapita, type = 'l', col = 'blue', lty = 2)
#lines(forprofit_txpercapita$year, forprofit_txpercapita$max_tx_percapita, type = 'l', col = 'blue', lty = 2)

lines(nonprofit_txpercapita$year, nonprofit_txpercapita$avg_tx_percapita, type = 'o', col = 'red' )
#lines(nonprofit_txpercapita$year, nonprofit_txpercapita$min_tx_percapita, type = 'l', col = 'red', lty = 2 )
#lines(nonprofit_txpercapita$year, nonprofit_txpercapita$max_tx_percapita, type = 'l', col = 'red', lty = 2 )

lines(govt_txpercapita$year, govt_txpercapita$avg_tx_percapita, type = 'o', col = 'green')
#lines(govt_txpercapita$year, govt_txpercapita$min_tx_percapita, type = 'l', col = 'green', lty = 2)
#lines(govt_txpercapita$year, govt_txpercapita$max_tx_percapita, type = 'l', col = 'green', lty = 2)


```
```{r}
state_final_df[is.na(state_final_df$tx_percapita),]
```

```{r}
#Connecticut, South Carolina, Ohio, Mississippi, New York have the fewest for-profit treatment centers per capita
#Maine, Idaho, Utah, North Dakota, Kentucky have the most for-profit treatment centers per capita 
state_final_df %>%
  group_by(State_Abbrev, profit_type) %>%
  summarize(avg_txpercapita = mean(tx_percapita, na.rm = TRUE)) %>%
  filter(profit_type == "Forprofit") %>%
  arrange(avg_txpercapita) %>%
  ungroup() %>%
  left_join(state, by = c("State_Abbrev" = "state_abbrevs")) %>%
ggplot(aes(x=long, y=lat, fill= avg_txpercapita, group=group)) + 
                      scale_fill_gradient(low = "grey", high = "green", limits = c(0,0.0001)) +
                      geom_polygon(color = "white") + 
                      ggtitle('Average Nonprofit Treatment Centers per capita 2015-2019') + 
                      coord_fixed(1.3)
```

```{r}
#Virginia, Texas, Idaho, South Carolina, Georgia have the fewest non-profit treatment centers per capita
#Hawaii, Alaska, Maine, Vermont, Wyoming have the most non-profit treatment centers per capita 
state_final_df %>%
  group_by(State_Abbrev, profit_type) %>%
  summarize(avg_txpercapita = mean(tx_percapita, na.rm = TRUE)) %>%
  filter(profit_type == "Nonprofit") %>%
  arrange(avg_txpercapita) %>%
  ungroup() %>%
  left_join(state, by = c("State_Abbrev" = "state_abbrevs")) %>%
ggplot(aes(x=long, y=lat, fill= avg_txpercapita, group=group)) + 
                      scale_fill_gradient(low = "grey", high = "blue", limits = c(0,0.0001)) +
                      geom_polygon(color = "white") + 
                      ggtitle('Average Nonprofit Treatment Centers per capita 2015-2019') + 
                      coord_fixed(1.3)
```

```{r}
#New Hampshire, Tennessee, Pennsylvania, Delaware, Kentucky have the fewest government treatment centers per capita
#Alaska, North Dakota, South Dakota, Wyoming, New Mexico have the most government treatment centers per capita 
state_final_df %>%
  group_by(State_Abbrev, profit_type) %>%
  summarize(avg_txpercapita = mean(tx_percapita, na.rm = TRUE)) %>%
  filter(profit_type == "Government") %>%
  arrange(avg_txpercapita) %>%
  ungroup() %>%
  left_join(state, by = c("State_Abbrev" = "state_abbrevs")) %>%
ggplot(aes(x=long, y=lat, fill= avg_txpercapita, group=group)) + 
                      scale_fill_gradient(low = "grey", high = "red", limits = c(0,0.00005)) +
                      geom_polygon(color = "white") + 
                      ggtitle('Average government Treatment Centers per capita 2015-2019') + 
                      coord_fixed(1.3)
```

```{r}
state_final_df %>%
  group_by(State_Name, profit_type) %>%
  pivot_wider(id_cols = c(State_Name, profit_type), names_from = year, values_from = count_tx, names_prefix = "yr") %>%
  mutate(change_2016_17 = yr2017 - yr2016,
         change_2017_18 = yr2018 - yr2017,
         change_2018_19 = yr2019 - yr2018, 
         total_change = yr2019 - yr2017)
  #mutate(increase = ifelse())
```

```{r}

state_yearly_averages<- state_final_df %>%
  distinct(State_Abbrev, profit_type, year, .keep_all = TRUE) %>%
  group_by(year) %>%
  summarize(avg_druguse = mean(illicit_druguse),
         avg_drinking = mean(binge_drinking),
         avg_od = mean(RATE)) %>%
  arrange(year)

plot(state_yearly_averages$year, state_yearly_averages$avg_druguse, type = 'l')
plot(state_yearly_averages$year, state_yearly_averages$avg_drinking, type = 'l')
plot(state_yearly_averages$avg_od, state_yearly_averages$avg_od, type = 'l')

```

